From f1e7db06eb1f0adffc1a4986b91cafcd913c2c0f Mon Sep 17 00:00:00 2001
From: parkrrrr
Date: Thu, 8 Jan 2004 16:48:44 +0000
Subject: [PATCH] Made xml_entitize understand and handle UTF-8

---
Review notes (text in this section is ignored when the patch is applied):
 - Restored the patch's line structure; tab indentation is reconstructed.
 - Restored the "&quot;" entity literal in the stdentities context line,
   which had been decoded to a bare quote character.
 - utf8_to_int() now checks every continuation byte, so a truncated or
   malformed sequence is passed through as one byte instead of being read
   (and later copied) past the string's terminating NUL.  Hunk offsets and
   the diffstat below are updated for the extra lines.

 gpsbabel/util.c | 137 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 121 insertions(+), 16 deletions(-)

diff --git a/gpsbabel/util.c b/gpsbabel/util.c
index 0ff2e6922..287e0ccd6 100644
--- a/gpsbabel/util.c
+++ b/gpsbabel/util.c
@@ -575,9 +575,70 @@ strsub(char *s, char *search, char *replace)
 	return d;
 }
 
+
+/*
+ * Decode the UTF-8 sequence that starts at cp.
+ *
+ * cp    - points at the first byte of the sequence (must not be NULL).
+ * bytes - receives the number of bytes consumed (1 to 6).
+ * value - receives the decoded code point.
+ *
+ * A byte that is not a valid lead byte, or a lead byte whose continuation
+ * bytes are missing or malformed, is returned as a single byte with
+ * *bytes == 1 and *value set to that byte.  Continuation bytes are checked
+ * in order, so decoding never reads past the string's terminating NUL.
+ */
+void utf8_to_int( const char *cp, int *bytes, int *value )
+{
+	const unsigned char *up = (const unsigned char *) cp;
+	int need;
+	int v;
+	int i;
+
+	/* The lead byte gives the sequence length and the top payload bits. */
+	if ( (*up & 0xe0) == 0xc0 ) {
+		need = 2;
+		v = *up & 0x1f;
+	}
+	else if ( (*up & 0xf0) == 0xe0 ) {
+		need = 3;
+		v = *up & 0x0f;
+	}
+	else if ( (*up & 0xf8) == 0xf0 ) {
+		need = 4;
+		v = *up & 0x07;
+	}
+	else if ( (*up & 0xfc) == 0xf8 ) {
+		need = 5;
+		v = *up & 0x03;
+	}
+	else if ( (*up & 0xfe) == 0xfc ) {
+		need = 6;
+		v = *up & 0x01;
+	}
+	else {
+		need = 1;
+		v = *up;
+	}
+
+	/* Each continuation byte (10xxxxxx) contributes six more bits. */
+	for ( i = 1; i < need; i++ ) {
+		if ( (up[i] & 0xc0) != 0x80 ) {
+			/* Truncated or malformed: pass the lead byte through alone. */
+			need = 1;
+			v = *up;
+			break;
+		}
+		v = (v << 6) | (up[i] & 0x3f);
+	}
+
+	*bytes = need;
+	*value = v;
+}
+
 char * xml_entitize(const char * str)
 {
-	int elen, ecount;
+	int elen, ecount, nsecount;
 	const char ** ep;
 	const char * cp;
 	char * p, * tmp, * xstr;
@@ -589,8 +650,11 @@ char * xml_entitize(const char * str)
 	"\"",	"&quot;",
 	NULL,	NULL
 	};
+	char tmpsub[20];
+	int bytes = 0;
+	int value = 0;
 	ep = stdentities;
-	elen = ecount = 0;
+	elen = ecount = nsecount = 0;
 
 	/* figure # of entity replacements and additional size. */
 	while (*ep) {
@@ -602,32 +666,73 @@ char * xml_entitize(const char * str)
 		}
 		ep += 2;
 	}
+
+	/* figure the same for other than standard entities (i.e. anything
+	 * that isn't in the range U+0000 to U+007F) */
+	for ( cp = str; *cp; cp++ ) {
+		if ( *cp & 0x80 ) {
+
+			utf8_to_int( cp, &bytes, &value );
+			cp += bytes-1;
+			elen += sprintf( tmpsub, "&#x%x;", value ) - bytes;
+			nsecount++;
+		}
+	}
 
 	/* enough space for the whole string plus entity replacements, if any */
 	tmp = xcalloc((strlen(str) + elen + 1), 1);
 	strcpy(tmp, str);
 
 	/* no entity replacements */
-	if (ecount == 0)
+	if (ecount == 0 && nsecount == 0)
 		return (tmp);
 
-	ep = stdentities;
+	if ( ecount != 0 ) {
+		ep = stdentities;
 
-	while (*ep) {
-		p = tmp;
-		while ((p = strstr(p, *ep)) != NULL) {
-			elen = strlen(*(ep + 1));
+		while (*ep) {
+			p = tmp;
+			while ((p = strstr(p, *ep)) != NULL) {
+				elen = strlen(*(ep + 1));
 
-			xstr = xstrdup(p + strlen(*ep));
+				xstr = xstrdup(p + strlen(*ep));
 
-			strcpy(p, *(ep + 1));
-			strcpy(p + elen, xstr);
+				strcpy(p, *(ep + 1));
+				strcpy(p + elen, xstr);
 
-			xfree(xstr);
+				xfree(xstr);
 
-			p += elen;
-		}
-		ep += 2;
-	}
+				p += elen;
+			}
+			ep += 2;
+		}
+	}
+
+	if ( nsecount != 0 ) {
+		/* replace each byte sequence that has the high bit set with a
+		 * hexadecimal numeric character reference ("&#x...;") */
+		p = tmp;
+		while (*p) {
+			if ( *p & 0x80 ) {
+				utf8_to_int( p, &bytes, &value );
+				/* keep the text after the sequence; sprintf overwrites it */
+				if ( p[bytes] ) {
+					xstr = xstrdup( p + bytes );
+				}
+				else {
+					xstr = NULL;
+				}
+				sprintf( p, "&#x%x;", value );
+				p = p+strlen(p);
+				if ( xstr ) {
+					strcpy( p, xstr );
+					xfree(xstr);
+				}
+			}
+			else {
+				p++;
+			}
+		}
+	}
 	return (tmp);
 }
-- 
2.30.2